library(arsenal)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(here)
## here() starts at /Users/stephcopeland/Documents/GitHub/222 Final Project/last take
library(tidyr)
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:arsenal':
##
## %nin%, N
library(tibble)
library(calecopal)
library(ggeffects)
library(gt)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
data1 <- read.csv("last_take.csv")
head(data1)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha
## 1 25.70585
## 2 280456.32110
## 3 1689.02503
## 4 3.42216
## 5 233919.67780
## 6 41.74732
data2 <- data1 %>%
mutate(prop_tree_loss = (tree_cover_loss_ha/area_sqkm)) %>%
mutate(prop_dalys = (dalys/population))
head(data2)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys
## 1 25.70585 3.937421e-05 7.010940e-03
## 2 280456.32110 2.249589e-01 2.685289e-02
## 3 1689.02503 6.164325e-02 5.371872e-05
## 4 3.42216 7.281192e-03 5.120522e-06
## 5 233919.67780 8.547540e-02 6.572033e-04
## 6 41.74732 1.466362e-03 4.215684e-04
data3 <- data2 %>%
mutate(per_tree_loss = (prop_tree_loss * 100)) %>%
mutate(per_dalys = (prop_dalys * 100))
head(data3)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys per_tree_loss per_dalys
## 1 25.70585 3.937421e-05 7.010940e-03 0.003937421 0.7010939935
## 2 280456.32110 2.249589e-01 2.685289e-02 22.495894850 2.6852892837
## 3 1689.02503 6.164325e-02 5.371872e-05 6.164324938 0.0053718717
## 4 3.42216 7.281192e-03 5.120522e-06 0.728119233 0.0005120522
## 5 233919.67780 8.547540e-02 6.572033e-04 8.547540196 0.0657203287
## 6 41.74732 1.466362e-03 4.215684e-04 0.146636171 0.0421568401
ggplot(data3, aes(sample = prop_dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).
ggplot(data3, aes(sample = dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).
data3 <- data3 %>%
mutate(logdalys = log(dalys))
ggplot(data3, aes(sample = logdalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).
data4 <- data3 %>%
mutate(log_dalys = log(prop_dalys))
head(data4)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys per_tree_loss per_dalys
## 1 25.70585 3.937421e-05 7.010940e-03 0.003937421 0.7010939935
## 2 280456.32110 2.249589e-01 2.685289e-02 22.495894850 2.6852892837
## 3 1689.02503 6.164325e-02 5.371872e-05 6.164324938 0.0053718717
## 4 3.42216 7.281192e-03 5.120522e-06 0.728119233 0.0005120522
## 5 233919.67780 8.547540e-02 6.572033e-04 8.547540196 0.0657203287
## 6 41.74732 1.466362e-03 4.215684e-04 0.146636171 0.0421568401
## logdalys log_dalys
## 1 12.4469377 -4.960284
## 2 13.5931998 -3.617382
## 3 5.0392773 -9.831749
## 4 -0.9307325 -12.182254
## 5 10.2732009 -7.327517
## 6 7.1240192 -7.771529
ggplot(data4, aes(sample = log_dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).
data4 <- data4 %>%
mutate(log_GDP = log(GDP_capita))
mod1 <- lm(logdalys ~ prop_tree_loss, data = data4)
summary(mod1)
##
## Call:
## lm(formula = logdalys ~ prop_tree_loss, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.059 -2.194 0.059 2.519 7.314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.1297 0.2579 35.397 <2e-16 ***
## prop_tree_loss -0.1450 0.1040 -1.395 0.165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.343 on 174 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.01105, Adjusted R-squared: 0.00537
## F-statistic: 1.945 on 1 and 174 DF, p-value: 0.1649
ggplot(data4, aes(y = log_dalys, x = prop_tree_loss)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
data4 <- data4 %>%
mutate(win_prop_tree = Winsorize(prop_tree_loss, minval = NULL, maxval = NULL, probs = c(0.00, 0.95), na.rm = TRUE, type = 9))
graph1 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
print(graph1)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
mod2 <- lm(logdalys ~ win_prop_tree, data = data4)
summary(mod2)
##
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8125 -2.1737 -0.1597 2.6571 7.4237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.8773 0.3137 28.297 <2e-16 ***
## win_prop_tree 0.6133 0.6480 0.946 0.345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.353 on 174 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.005121, Adjusted R-squared: -0.0005966
## F-statistic: 0.8957 on 1 and 174 DF, p-value: 0.3453
names(mod2)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "na.action" "xlevels" "call" "terms"
## [13] "model"
graph2 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree, color = log_GDP)) +
geom_point(aes(size = 0.5)) +
stat_smooth(method = "lm", se = FALSE)
print(graph2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
mod3 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data4)
summary(mod3)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree + log_GDP, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1512 -0.9453 0.2060 0.9732 4.7871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.69968 0.78175 6.012 1.09e-08 ***
## win_prop_tree 0.71536 0.32810 2.180 0.0306 *
## log_GDP -1.36044 0.08909 -15.271 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.672 on 170 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5783
## F-statistic: 118.9 on 2 and 170 DF, p-value: < 2.2e-16
names(mod3)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "na.action" "xlevels" "call" "terms"
## [13] "model"
graph3 <- ggplot(data4, aes(y = log_dalys, x = log_GDP, color = win_prop_tree)) +
geom_point(aes(size = 0.5)) +
stat_smooth(method = "lm", se = FALSE)
print(graph3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
data5 <- data4 %>%
mutate(country_income_level =
case_when(GDP_capita >= 13000 ~ "high",
GDP_capita >= 4000 ~ "middle",
GDP_capita <= 4000 ~ "low"))
data5 %>%
group_by(country_income_level) %>%
summarize(n = n())
## # A tibble: 4 × 2
## country_income_level n
## <chr> <int>
## 1 high 52
## 2 low 72
## 3 middle 51
## 4 <NA> 14
equLOWcountries <- c("Burundi", "Benin", "Bhangladesh", "Bolivia", "Central African Republic", "Cote d'Ivoire", "Cameroon", "Democratic Republic of Congo", "Congo", "Comoros", "Ethiopia", "Ghana", "Guinea", "Gambia", "Honduras", "Haiti", "Indonesia", "Kenya", "Cambodia", "Laos", "Liberia", "Madagascar", "Mali", "Myanmar", "Mozambique",
"Mauritania", "Malawi", "Nigeria", "Nicaragua", "Phillipines", "Rwanda", "Senegal", "Sierra Leone", "El Salvador", "Eswatini", "Togo", "Tanzania", "Uganda", "Vietnam", "Zambia", "Zimbabwe")
noneqLOWcountries <- c("AFG", "ARM", "BTN", "CPV", "EGY", "ERI", "FSM", "GNB", "IND", "KGZ", "LSO", "MAR", "MDA", "MNG", "NER", "NPL", "PAK", "PNG", "SDN", "SLB", "SSD", "STP", "SYR", "TCD", "TJK", "TLS", "TUN", "UKR", "UZB", "VUT")
data6 <- data5 %>%
filter(country_income_level == "low") %>%
filter(!(code %in% c(noneqLOWcountries)))
mod3 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data4)
mod4 <- lm(log_dalys ~ win_prop_tree, data = data6)
summary(mod4)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.499 -1.321 0.180 1.315 2.359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.9986 0.3069 -13.030 5.6e-16 ***
## win_prop_tree -0.3044 0.4773 -0.638 0.527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.417 on 40 degrees of freedom
## Multiple R-squared: 0.01007, Adjusted R-squared: -0.01468
## F-statistic: 0.4068 on 1 and 40 DF, p-value: 0.5272
ggplot(data6, aes(y = log_dalys, x = win_prop_tree)) +
geom_point(aes(color = GDP_capita, size = 1.0)) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
mod5 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data6)
summary(mod5)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree + log_GDP, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0354 -0.7776 -0.2154 0.8573 2.1058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.89557 1.78369 2.745 0.00911 **
## win_prop_tree -0.01454 0.38072 -0.038 0.96973
## log_GDP -1.27217 0.25277 -5.033 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.117 on 39 degrees of freedom
## Multiple R-squared: 0.3999, Adjusted R-squared: 0.3691
## F-statistic: 12.99 on 2 and 39 DF, p-value: 4.743e-05
ggplot(data6, aes(y = log_dalys, x = GDP_capita)) +
geom_point(aes(color = win_prop_tree, size = 2)) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Interactive version
p <- data5 %>%
mutate(GDP_capita=round(GDP_capita,0)) %>%
mutate(population=round(population/1000000,2)) %>%
mutate(log_dalys=round(log_dalys,1)) %>%
# Reorder countries to having big bubbles on top
arrange(desc(population)) %>%
#mutate(country = factor(country, country)) %>%
# prepare text for tooltip
mutate(text = paste("Country: ", country, "\nPopulation (M): ", population, "\n DALYs: ", log_dalys, "\nGdp per capita: ", GDP_capita, sep="")) %>%
# Classic ggplot
ggplot( aes(x=GDP_capita, y=log_dalys, size = population, color = country_income_level, text=text)) +
geom_point(alpha=0.5) +
scale_size(range = c(1.4, 19), name="Global DALYs") +
scale_color_viridis(discrete=TRUE, guide=FALSE) +
theme_ipsum() +
theme(legend.position="none")
pp <- ggplotly(p, tooltip="text")
pp
Hypothesis Testing